Project description

Effect of cohesin looping on genome architecture, from the perspective of the nuclear lamina.

Introduction

I have previously correlated DamID changes of LADs with size and score. Here, I will extend these comparisons with more features. Including:

(1. LAD score) 2. LAD size 3. (Active) gene density 4. CTCF density 5. Distance to centromeres (left arm) 6. Distance to telomeres (right arm) 7. Local LAD density 8. …

Method

Generate a heatmap with spearman correlations. I might change this into something more sophisticated later to account for feature correlations, for example linear modeling.

Set-up

Load the libraries and set the parameters.

# Load dependencies
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggbeeswarm))
suppressPackageStartupMessages(library(GGally))
suppressPackageStartupMessages(library(broom))
suppressPackageStartupMessages(library(corrr))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(ggrastr))

# Prepare output 
output.dir <- "ts220113_LAD_changes_versus_features"
dir.create(output.dir, showWarnings = FALSE)

# Load input
input.dir <- "ts220113_DamID_changes_versus_LAD_size_and_score"
gr_LADs_list <- readRDS(file.path(input.dir, "LADs_list.rds"))
gr_LAD_consensus <- readRDS(file.path(input.dir, "LADs_consensus.rds"))

gr_ctcf <- import("Data_NQ/ChIP_NQ/CohesinFactors/2_Wapl-0D-antiCtcf_sampleOnly_peaks.narrowPeak")

input.dir <- "ts220113_GeneExpression"
gr_genes <- readRDS(file.path(input.dir, "genes.rds"))
tib_genes <- readRDS(file.path(input.dir, "genes_fpkm_mean.rds"))

input.dir <- "ts220113_effect_of_CTCF_depletion_on_LAD_borders"
gr_damid <- readRDS(file.path(input.dir, "damid.rds"))
tib_metadata <- readRDS(file.path(input.dir, "metadata_damid.rds"))

Prepare knitr.

library(knitr)
opts_chunk$set(fig.width = 10, fig.height = 4, cache = T,
               message = F, warning = F,
               dev=c('png', 'pdf'), fig.path = file.path(output.dir, "figures/")) 
pdf.options(useDingbats = FALSE)

Functions.

1. Get all parameters

First, I will add all the features to the LADs.

gr_LADs_list <- c(gr_LADs_list, list(consensus = gr_LAD_consensus))

1.1 LAD score

This is already included in the LAD list: the first column.

gr_LADs_list <- purrr::map(gr_LADs_list,
                           function(x) {
                             #x$score <- mcols(x)[, 1]
                             x
                           })

1.2 LAD size

Size in bases. In log2 with a pseudocount of 1.

gr_LADs_list <- purrr::map(gr_LADs_list,
                           function(x) {
                             x$size <- log2(width(x) + 1)
                             x
                           })

1.3 Active gene density

Active genes > 1 FPKM. Count overlap, irrespective of gene length. In log2 with a pseudocount of 1.

# Extend
extend <- 0e5

# Determine active genes
cutoff <- 1
idx <- tib_genes$WT_0h > cutoff
genes.active <- gr_genes[idx]
genes.inactive <- gr_genes[! idx]

gr_LADs_list <- purrr::map(gr_LADs_list,
                           function(x) {
                             # Get number of overlapping genes 
                             # - including 1 base overlap
                             x.extend <- x
                             start(x.extend) <- start(x) - extend
                             end(x.extend) <- end(x) + extend
                             tmp <- countOverlaps(x.extend, genes.active,
                                                  ignore.strand = T)
                             # Convert to number / Mb - active genes
                             tmp <- tmp / (width(x)/1e6)
                             x$active_gene_density <- log2(tmp + 1)
                             # tmp <- countOverlaps(x.extend, genes.inactive,
                             #                      ignore.strand = T)
                             # # Convert to number / Mb - inactive genes
                             # tmp <- tmp / (width(x)/1e6)
                             # x$inactive_gene_density <- log2(tmp + 1)
                             x
                           })

1.4 CTCF density

CTCF peak density, in log2 with a pseudocount of 1.

# Determine strong ctcf peaks
#gr_ctcf_filtered <- gr_ctcf[gr_ctcf$score > 100]
gr_ctcf_filtered <- gr_ctcf

gr_LADs_list <- purrr::map(gr_LADs_list,
                           function(x) {
                             # Get number of overlapping peaks 
                             # - including 1 base overlap
                             x.extend <- x
                             start(x.extend) <- start(x) - extend
                             end(x.extend) <- end(x) + extend
                             tmp <- countOverlaps(x.extend, gr_ctcf_filtered, 
                                                  ignore.strand = T)
                             # Convert to number / Mb
                             tmp <- tmp / (width(x)/1e6)
                             x$ctcf_density <- log2(tmp + 1)
                             x
                           })

1.5 Distance to centromeres

Distance to centromere (first base of the genome). Use middle of LADs to determine distance. Distance in Mb.

gr_LADs_list <- purrr::map(gr_LADs_list,
                           function(x) {
                             x.mid <- x
                             start(x.mid) <- end(x.mid) <- (start(x.mid) + end(x.mid))/2
                             x$distance_to_centromere <- start(x.mid) / 1e6
                             x
                           })

1.6 Distance to telomeres

Distance to centromere (last base of the genome). Use middle of LADs to determine distance. Distance in Mb.

gr_chromosome <- read_tsv("~/mydata/data/genomes/mm10/mm10.chrom.sizes", 
                          col_names = c("seqnames", "start")) %>%
  mutate(end = start) %>%
  as(., "GRanges")

gr_LADs_list <- purrr::map(gr_LADs_list,
                           function(x) {
                             x.mid <- x
                             start(x.mid) <- end(x.mid) <- (start(x.mid) + end(x.mid))/2
                             dis <- distanceToNearest(x.mid, gr_chromosome)
                             x$distance_to_telomere <- mcols(dis)$distance / 1e6
                             x
                           })

1.7 Local LAD density

This one is the most difficult. What is “local” here, should I count weak LADs as much as strong LADs and should I include the LAD itself? For now, I will take a random size around each LAD (say, 15Mb) and simply count LADs.

# Get bins
idx <- which(rowSums(! is.na(as_tibble(mcols(gr_damid)))) > 0)
bins <- gr_damid[idx]
mcols(bins) <- NULL

gr_LADs_list <- purrr::map(gr_LADs_list,
                           function(x) {
                             x.extend <- resize(x, 30e6, fix = "center")
                             # Get LAD bins
                             bins.lads <- bins[bins %over% x]
                             ovl.all <- countOverlaps(x.extend, bins)
                             ovl.bins <- countOverlaps(x.extend, bins.lads)
                             
                             x$local_lad_density <- ovl.bins / ovl.all
                             x
                           })

1.8 Chromosome size

Size of the chromosomes.

# Chromosome size
chrom_sizes <- read_tsv("~/mydata/data/genomes/mm10/mm10.chrom.sizes",
                        col_names = c("seqnames", "length"))

gr_LADs_list <- purrr::map(gr_LADs_list,
                           function(x) {
                             idx <- match(as.character(seqnames(x)), 
                                          chrom_sizes$seqnames)
                             x$chrom_size <- chrom_sizes$length[idx] / 1e6
                             x
                           })

1.9 Histone modifications

Average histone modifications for the LADs.

# Prepare output directory
bedfile.dir <- file.path(output.dir, "bed_files")
dir.create(bedfile.dir, showWarnings = F)

deeptools.dir <- file.path(output.dir, "deeptools")
dir.create(deeptools.dir, showWarnings = F)

# Prepare histone modifications
track_dir <- "/DATA/scratch/usr/t.v.schaik/proj/tests/results/ts181120_pADamID_mouse/analysis_CTCF_AID/Data_NQ/ChIP_NQ/HistoneModifications/Public_2i_ChIP/"

tracks <- grep("H3K27me1|H3K27me2|H3K9me3", 
               dir(track_dir, full.names = T, pattern = "E14_2i"),
               value = T, invert = T)

track_names <- str_remove(str_remove(basename(tracks),
                                     "E14_2i_"), "_.*")


# Coverage over LADs
gr_LADs_list <- purrr::map(names(gr_LADs_list),
                           function(x) {
                             
                             # Prepare bed file
                             gr_x <- gr_LADs_list[[x]]
                             bed_x <- file.path(bedfile.dir, paste0(x, ".bed"))
                             export.bed(gr_x, bed_x)
                             output_base_x <- file.path(deeptools.dir, x)
                             
                             # Calculate coverage
                             deeptools_call <- paste("/home/t.v.schaik/mydata/miniconda3/envs/deeptools/bin/multiBigwigSummary",
                                                     "BED-file",
                                                     "-b", paste(tracks, collapse = " "),
                                                     "--BED", bed_x,
                                                     "--outFileName", paste0(output_base_x, ".npz"),
                                                     "--outRawCounts", paste0(output_base_x, ".tab"))
                             
                             system(deeptools_call)
                             
                             # Read scores
                             tib_x <- read_tsv(paste0(output_base_x, ".tab"),
                                               comment = "#",
                                               col_names = c("seqnames", "start", "end",
                                                             track_names)) %>%
                               dplyr::select(-seqnames, -start, -end)
                             
                             mcols(gr_x) <- bind_cols(as_tibble(mcols(gr_x)),
                                                      tib_x)
                             
                             # Save data
                             gr_LADs_list[[x]] <- gr_x
                             
                           })

1.x Correlation between features

I have now gathered all kinds of features. For interpretation, it’s useful to know correlations between them, to prevent wrong interpretation. Use the customized ggpairs for this. Only for one set of LADs : CTCF-WAPL.

# From Fede:
# ggpairs custom functions
corColor <- function(data, mapping, color = I("black"), sizeRange = c(1, 3), ...) {

  x   <- eval_data_col(data, mapping$x)
  y   <- eval_data_col(data, mapping$y)
  r   <- cor(x, y, method = "spearman")
  #rt  <- format(r, digits = 3)
  rt  <- format(round(r, 2), nsmall = 2)
  tt  <- as.character(rt)
  cex <- max(sizeRange)

  # helper function to calculate a useable size
  percent_of_range <- function(percent, range) {
    percent * diff(range) + min(range, na.rm = TRUE)
  }

  # plot correlation coefficient
  p <- ggally_text(label = tt, mapping = aes(), xP = 0.5, yP = 0.5,
                   # size = I(percent_of_range(cex * abs(r), sizeRange)), 
                   size = 6, 
                   color = color, ...) +
    theme(panel.grid.minor=element_blank(),
          panel.grid.major=element_blank())

  corColors <- RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")[2:6]

  if (r <= boundaries[1]) {
    corCol <- corColors[1]
  } else if (r <= boundaries[2]) {
    corCol <- corColors[2]
  } else if (r < boundaries[3]) {
    corCol <- corColors[3]
  } else if (r < boundaries[4]) {
    corCol <- corColors[4]
  } else {
    corCol <- corColors[5]
  }

  p <- p +
    theme(panel.background = element_rect(fill = corCol))

  return(p)
}

customScatter <- function (data, mapping) 
{
    p <- ggplot(data = data, mapping) + 
      rasterize(geom_point(alpha = 0.1, size = 0.5,
                           shape = 20),
                dpi = 300) +
      #geom_smooth(method = "lm", se = T, col = "red") +
      theme_bw()
    
    p 
}


# Levels to plot
feature_levels <- c("score", 
                    "size", "ctcf_density", "active_gene_density", 
                    "inactive_gene_density", 
                    #"H3K4me1", "H3K27ac", 
                    "H3K27me3", "H3K9me2",
                    "local_lad_density", 
                    "distance_to_centromere", "distance_to_telomere", 
                    "chrom_size")


# Prepare data frame, remove NAs for now
tib <- as_tibble(mcols(gr_LADs_list[[1]])) %>%
  dplyr::select(-matches("_[0-9]+h"))

# Use GGally to make correlation plots
boundaries <- seq(from = -0.3, to = 0.3, length.out = 4)

set.seed(1)
plt <- ggpairs(tib %>% 
                 drop_na() %>%
                 dplyr::select(one_of(feature_levels)),
               upper = list(continuous = corColor),
               lower = list(continuous = customScatter),
               diag = list(continuous = function(data, mapping, ...) {
                   ggally_densityDiag(data = data, mapping = mapping, alpha = 0.3, fill = "red") +
                   theme_bw()})) +
  ggtitle("Correlations LAD features") +
  xlab("") +
  ylab("")

print(plt)

# # Repeat, only LADs with size > 2e5
# plt <- ggpairs(tib[width(gr_LADs_list[[1]]) > 2e5, ] %>% drop_na(),
#                upper = list(continuous = corColor),
#                lower = list(continuous = customScatter),
#                diag = list(continuous = function(data, mapping, ...) {
#                    ggally_densityDiag(data = data, mapping = mapping, alpha = 0.3, fill = "red") +
#                    theme_bw()})) +
#   ggtitle("Correlations LAD features") +
#   xlab("") +
#   ylab("")
# 
# print(plt)

As supplementary figure, this will show that most features are mostly uncorrelated.

Finally, select the consensus model LAD definition. This is the only important model.

gr_LAD_consensus <- gr_LADs_list[[length(gr_LADs_list)]]
gr_LADs_list <- gr_LADs_list[-length(gr_LADs_list)]

2. Correlations

Now, I want to correlate everything. Include a scatterplot for every correlation. Use the Kendall Tau correlation, which is supposedly better at handling ties: https://stackoverflow.com/questions/10711395/spearman-correlation-and-ties.

Instead, I decided to use the Spearman correlation, which may be less suited but much more widely-used and appreciated.

# Change feature levels
feature_levels <- rev(feature_levels)

Correlations <- function(gr, make_plot = T, min_width = 2e5) {
  
  # Prepare tibble - gathered
  #gr_filtered <- gr[width(gr) > min_width]
  #gr_filtered <- gr[gr$active_gene_density != 0 & gr$ctcf_density != 0]
  
  tib <- as_tibble(mcols(gr)) %>%
    rename_at(1, ~ "t_0h") %>%
    mutate_at(vars(matches("_[0-9]+h")), function(x) x - .$t_0h) %>%
    dplyr::select(-1) %>%
    gather(key_timepoint, timepoint_score, matches("_[0-9]+h")) %>%
    gather(key_feature, feature_score, -contains("timepoint")) %>%
    filter(key_feature %in% feature_levels) %>%
    mutate(key_timepoint = factor(key_timepoint, levels = unique(key_timepoint)),
           key_feature = factor(key_feature, levels = feature_levels))
  
  # tib_filtered <- as_tibble(mcols(gr_filtered)) %>%
  #   rename_at(1, ~ "t_0h") %>%
  #   mutate_at(vars(matches("_[0-9]+h")), function(x) x - .$t_0h) %>%
  #   dplyr::select(-1) %>%
  #   gather(key_timepoint, timepoint_score, matches("_[0-9]+h")) %>%
  #   gather(key_feature, feature_score, -contains("timepoint")) %>%
  #   mutate(key_timepoint = factor(key_timepoint, levels = unique(key_timepoint)),
  #          key_feature = factor(key_feature, levels = feature_levels))
  
  # Prepare plot
  plt <- tib %>%
    mutate(key_feature = factor(key_feature, 
                                levels = rev(levels(key_feature)))) %>%
    ggplot(aes(x = feature_score, y = timepoint_score)) +
    geom_point(size = 0.15) +
    geom_hline(yintercept = 0, col = "blue", linetype = "dashed") +
    geom_smooth(method = "lm", col = "red") +
    facet_grid(key_timepoint ~ key_feature, scales = "free_x") +
    xlab("Feature score") +
    ylab("DamID difference (z-score)") +
    theme_bw() +
    theme(aspect.ratio = 1)
  if (make_plot) {
    plot(plt)
  }
  
  # Prepare correlations
  tib_summary <- tib %>%
  #tib_summary <- tib_filtered %>%
    #filter(! grepl("gene|ctcf", key_feature)) %>%
    group_by(key_feature, key_timepoint) %>%
    dplyr::summarize(cor = cor(feature_score, timepoint_score, 
                               method = "spearman", use = "complete.obs"),
                     pvalue = cor.test(feature_score, timepoint_score, 
                                       method = "spearman", 
                                       use = "complete.obs")$p.value) %>%
    ungroup() %>%
    mutate(key_timepoint = as.character(key_timepoint))
  
  # # Process gene / ctcf density separately - only meaningful for bigger LADs 
  # tib_summary_filtered <- tib_filtered %>%
  #   filter(grepl("gene|ctcf", key_feature)) %>%
  #   group_by(key_feature, key_timepoint) %>%
  #   dplyr::summarize(cor = cor(feature_score, timepoint_score, 
  #                              method = "kendall", use = "complete.obs"),
  #                    pvalue = cor.test(feature_score, timepoint_score, 
  #                                      method = "kendall", 
  #                                      use = "complete.obs")$p.value) %>%
  #   ungroup() %>%
  #   mutate(key_timepoint = as.character(key_timepoint))
  
  #tib_summary <- bind_rows(tib_summary, tib_summary_filtered)
  tib_summary
  
}

# Get all correlations
tib_correlations <- purrr::reduce(purrr::map(gr_LADs_list, Correlations),
                                  bind_rows)

Plot the correlation matrix.

# Process correlations 
tib_correlations <- tib_correlations %>%
  mutate(pvalue = p.adjust(pvalue, method = "BH")) %>%
  separate(key_timepoint, c("condition", "timepoint"), remove = F) %>%
  mutate(condition = factor(condition, levels = levels(tib_metadata$condition)),
         timepoint = factor(timepoint, levels = levels(tib_metadata$timepoint))) %>%
  arrange(condition, timepoint, key_feature) %>%
  mutate(key_timepoint = factor(key_timepoint, levels = unique(key_timepoint))) %>%
  filter(! condition %in% c("CTCFNQ", "CTCF"),
         ! key_feature %in% c("score", "inactive_gene_density"))
 
# Plot
cor_max <- max(abs(tib_correlations$cor))*1.2

plt <- tib_correlations %>%
  ggplot(aes(x = key_timepoint, y = key_feature, fill = cor)) +
  geom_tile() +
  geom_text(aes(label = ifelse(pvalue < 0.05, "*", ""))) +
  xlab("") +
  ylab("") +
  coord_equal() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red",
                       limits = c(-cor_max, cor_max)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

plot(plt)

# Plot without 96h
plt <- tib_correlations %>%
  filter(! timepoint %in% "96h") %>%
  ggplot(aes(x = key_timepoint, y = key_feature, fill = cor)) +
  geom_tile() +
  geom_text(aes(label = ifelse(pvalue < 0.05, "*", ""))) +
  xlab("") +
  ylab("") +
  coord_equal() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red",
                       limits = c(-cor_max, cor_max)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

plot(plt)

# Add numbers and shade p-values (reviewer 2)
plt <- tib_correlations %>%
  filter(! timepoint %in% "96h") %>%
  ggplot(aes(x = key_timepoint, y = key_feature, fill = cor)) +
  geom_tile(aes(color = ifelse(pvalue < 0.05, "*", ""))) +
  geom_text(aes(label = round(cor, 2)),
            size = 2) +
  xlab("") +
  ylab("") +
  coord_equal() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red",
                       limits = c(-cor_max, cor_max)) +
  scale_color_manual(values = c("white", "black")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

plot(plt)

plt <- tib_correlations %>%
  filter(! timepoint %in% "96h") %>%
  ggplot(aes(x = key_timepoint, y = key_feature, fill = cor)) +
  geom_tile() +
  geom_text(aes(label = round(cor, 2),
                color = pvalue < 0.05),
            size = 2) +
  xlab("") +
  ylab("") +
  coord_equal() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red",
                       limits = c(-cor_max, cor_max)) +
  scale_color_manual(values = c("grey80", "grey20")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

plot(plt)

Various ways of plotting. The last is the best.

3. Linear model

As I showed in the previous plots, a combination of (partially correlating) features explain the effects. Here, I will try simple linear modeling and see how much I can explain.

lmp <- function (modelobject) {
  if (class(modelobject) != "lm") stop("Not an object of class 'lm' ")
  f <- summary(modelobject)$fstatistic
  p <- pf(f[1],f[2],f[3],lower.tail=F)
  attributes(p) <- NULL
  return(p)
}

LinearModeling <- function(gr, make_plot = T, min_width = 2e5) {
  
  # Prepare tibble - gathered
  tib <- as_tibble(mcols(gr)) %>%
    rename_at(1, ~ "t_0h") %>%
    mutate_at(vars(matches("_[0-9]+h")), function(x) x - .$t_0h) %>%
    dplyr::select(-1) %>%
    gather(key_timepoint, timepoint_score, matches("_[0-9]+h")) %>%
    mutate(key_timepoint = factor(key_timepoint, levels = unique(key_timepoint)))
  
  # Prepare linear models
  tib_lm <- tib %>%
    group_by(key_timepoint) %>%
    nest() %>%
    mutate(fit = map(data, ~ lm(timepoint_score ~ size +
                                  active_gene_density +
                                  #inactive_gene_density +
                                  ctcf_density + 
                                  distance_to_centromere +
                                  distance_to_telomere + 
                                  local_lad_density,
                                  #chrom_size + 
                                  #H3K4me1 +
                                  #H3K27me3 +
                                  #H3K9me2, 
                                data = .x)),
           tidied = map(fit, tidy))
  
  tib_explained <- tib_lm %>%
    mutate(summary = map(fit, summary),
           rsquared = map(summary, function(x) x$adj.r.squared),
           pvalue = map(fit, lmp)) %>%
    unnest(rsquared, pvalue) %>%
    dplyr::select(rsquared, pvalue, key_timepoint) %>%
    mutate(key_timepoint = as.character(key_timepoint))
  
  tib_features <- tib_lm %>%
    unnest(tidied) %>%
    mutate(key_timepoint = as.character(key_timepoint))
  
  list(tib_explained, tib_features)
  
}

# Get all linear models
list_linear_models <- purrr::map(gr_LADs_list, LinearModeling)

tib_linear_models_explained <- purrr::reduce(lapply(list_linear_models, 
                                                    function(x) x[[1]]),
                                             bind_rows) %>%
  ungroup() %>%
  separate(key_timepoint, c("condition", "timepoint"), remove = F) %>%
  filter(! condition %in% c("CTCFNQ", "CTCF")) %>%
  mutate(key_timepoint = factor(key_timepoint, levels = unique(key_timepoint)),
         condition = factor(condition, levels = c("CTCFEL", "RAD21",
                                                  "WAPL", "CTCFWAPL")),
         timepoint = factor(timepoint, levels = c("6h", "24h", "96h")))

tib_linear_models_features <- purrr::reduce(lapply(list_linear_models, 
                                                   function(x) x[[2]]),
                                            bind_rows) %>%
  dplyr::select(-data, -fit) %>%
  ungroup() %>%
  separate(key_timepoint, c("condition", "timepoint"), remove = F) %>%
  filter(! condition %in% c("CTCFNQ", "CTCF")) %>%
  mutate(key_timepoint = factor(key_timepoint, levels = unique(key_timepoint)),
         condition = factor(condition, levels = c("CTCFEL", "RAD21",
                                                  "WAPL", "CTCFWAPL")),
         timepoint = factor(timepoint, levels = c("6h", "24h", "96h")),
         term = factor(term, levels = feature_levels))

# Plot Rsquared and p-values
plt <- tib_linear_models_explained %>%
  filter(timepoint != "96h") %>%
  ggplot(aes(x = key_timepoint, y = rsquared, fill = condition)) +
  geom_bar(stat = "identity") +
  scale_fill_brewer(palette = "Dark2") +
  xlab("") +
  ylab("R-squared") +
  theme_bw() +
  theme(aspect.ratio = 1,
        axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)

plt <- tib_linear_models_explained %>%
  filter(timepoint != "96h") %>%
  ggplot(aes(x = key_timepoint, y = -log10(pvalue), fill = condition)) +
  geom_bar(stat = "identity") +
  scale_fill_brewer(palette = "Dark2") +
  xlab("") +
  ylab("-log10(p-value)") +
  scale_y_log10() +
  theme_bw() +
  theme(aspect.ratio = 1,
        axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)

# Create heatmap with all features and their significance
plt <- tib_linear_models_features %>%
  filter(! timepoint %in% "96h",
         term != "(Intercept)") %>%
  ggplot(aes(x = key_timepoint, y = term, 
             fill = -log10(p.value) * ifelse(estimate > 0, 1, -1))) +
  geom_tile() +
  geom_text(aes(label = ifelse(p.value < 0.05, "*", ""))) +
  xlab("") +
  ylab("") +
  coord_equal() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", 
                       midpoint = 0, name = "-log10(p-value)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

plot(plt)

It looks good, but is too complicated. The actual correlations are more transparent.

4. Plots consensus model

Something I should have done before: are the differences between the depletion experiments similar?

# Get consensus data + calculate differences
tib <- as_tibble(mcols(gr_LAD_consensus)) %>%
  mutate(idx = 1:nrow(.),
         seqnames = as.character(seqnames(gr_LAD_consensus)),
         seqnames = factor(seqnames, unique(seqnames))) %>%
  mutate(PT_24h_diff = PT_24h - PT_0h,
         CTCFEL_6h_diff = CTCFEL_6h - CTCFEL_0h,
         CTCFEL_24h_diff = CTCFEL_24h - CTCFEL_0h,
         RAD21_6h_diff = RAD21_6h - RAD21_0h,
         RAD21_24h_diff = RAD21_24h - RAD21_0h,
         WAPL_6h_diff = WAPL_6h - WAPL_0h,
         WAPL_24h_diff = WAPL_24h - WAPL_0h,
         CTCFWAPL_6h_diff = CTCFWAPL_6h - CTCFWAPL_0h,
         CTCFWAPL_24h_diff = CTCFWAPL_24h - CTCFWAPL_0h)


# GGpairs
boundaries <- seq(from = 0, to = 0.7, length.out = 4)

plt <- ggpairs(tib %>%
                 dplyr::select(contains("diff")) %>%
                 drop_na(),
               upper = list(continuous = corColor),
               lower = list(continuous = customScatter),
               diag = list(continuous = function(data, mapping, ...) {
                   ggally_densityDiag(data = data, mapping = mapping, alpha = 0.3, fill = "red") +
                   theme_bw()})) +
  ggtitle("Correlating all data") +
  xlab("") +
  ylab("")

# print(plt)


# Correlation plot
tib_cor <- tib %>%
  dplyr::select(contains("diff")) %>%
  drop_na() %>%
  correlate(method = "spearman") %>%
  gather(key, value, -term) %>%
  mutate(value = replace_na(value, 1)) %>%
  mutate(n1 = str_remove(term, "_diff"),
         n2 = str_remove(key, "_diff"),
         n1 = factor(n1, levels(tib_metadata$sample)),
         n2 = factor(n2, rev(levels(tib_metadata$sample))))

# Plot
plt <- tib_cor %>%
  ggplot(aes(x = n1, y = n2, fill = value, label = round(value, 2))) +
  geom_tile() +
  geom_text() +
  xlab("") + ylab("") +
  scale_fill_gradientn(colors = colorRampPalette(rev(brewer.pal(n = 7, 
                                                                name = "RdYlBu")))(100),
                       limits = c(min(-1, tib_cor$value), 1),
                       name = "Spearman correlation") +
  theme_bw() +
  theme(aspect.ratio = 1,
        axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)

# Prepare differences for plotting
tib_diff <- tib %>%
  dplyr::select(-ends_with("h"))

tib_diff %>%
  ggplot(aes(x = RAD21_24h_diff, y = CTCFWAPL_24h_diff, 
             col = distance_to_telomere)) +
  geom_point() +
  geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
  geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
  coord_cartesian(xlim = c(-1, 1), ylim = c(-1, 1)) +
  ggtitle(round(cor(tib_diff$RAD21_24h_diff, tib_diff$CTCFEL_24h_diff)), 2) +
  scale_color_gradient2(low = "blue", mid = "grey", high = "red",
                        midpoint = median(tib_diff$distance_to_telomere)) +
  theme_bw() +
  theme(aspect.ratio = 1)

tib_plot <- tib_diff %>%
  gather(key, value, contains("24h")) %>%
  mutate(key = str_remove(key, "_diff"),
         key = factor(key, levels(tib_metadata$sample))) 

tib_plot %>%
  ggplot(aes(x = distance_to_telomere, y = value)) +
  geom_point(size = 1, alpha = 0.3) +
  geom_smooth(method = "lm", col = "red") +
  geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
  geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
  #coord_cartesian(ylim = c(-1, 1)) +
  xlab("Distance to telomere (Mb)") +
  ylab("LAD difference") +
  facet_grid(. ~ key) +
  theme_bw() +
  theme(aspect.ratio = 1)

tib_plot %>%
  ggplot(aes(x = distance_to_telomere, y = value, col = seqnames)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "lm", col = "red") +
  geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
  geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
  #coord_cartesian(ylim = c(-1, 1)) +
  xlab("Distance to telomere (Mb)") +
  ylab("LAD difference") +
  facet_grid(. ~ key) +
  theme_bw() +
  theme(aspect.ratio = 1)

Conclusion: most LAD differences are very independent. Exception is the WAPL and CTCF/WAPL depletion. Makes sense.

Next, make plots of the LAD size. Visualize that this is an important feature.

# Change size to Mb
tib_plot <- tib_plot %>%
  mutate(size_Mb = 2**size - 1,
         size_Mb = size_Mb / 1e6)


tib_plot %>%
  ggplot(aes(x = size_Mb, y = value)) +
  geom_point(size = 1, alpha = 0.3) +
  geom_smooth(method = "lm", col = "red") +
  geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
  #coord_cartesian(ylim = c(-1, 1)) +
  xlab("LAD size (Mb)") +
  ylab("LAD difference") +
  facet_grid(. ~ key) +
  scale_x_log10() +
  theme_bw() +
  theme(aspect.ratio = 1)

tib_plot %>%
  ggplot(aes(x = size_Mb, y = value, col = seqnames)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "lm", col = "red") +
  geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
  #coord_cartesian(ylim = c(-1, 1)) +
  xlab("LAD size (Mb)") +
  ylab("LAD difference") +
  facet_grid(. ~ key) +
  scale_x_log10() +
  theme_bw() +
  theme(aspect.ratio = 1)

# Also, plot CTCF density
tib_plot %>%
  ggplot(aes(x = ctcf_density, y = value)) +
  geom_point(size = 1, alpha = 0.3) +
  geom_smooth(method = "lm", col = "red") +
  geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
  #coord_cartesian(ylim = c(-1, 1)) +
  xlab("LAD size (Mb)") +
  ylab("LAD difference") +
  facet_grid(. ~ key) +
  scale_x_log10() +
  theme_bw() +
  theme(aspect.ratio = 1)

5. Differential analysis based on PT values

Above, I tried to provide a convincing story using a correlation matrix. This resulted in some resistance. Actual numbers of different regions were requested. Here, I will do it differently. I will use the new PT data to estimate a background distribution of differences, and use those to determine LADs that are significantly different.

#######################################
### Determine the LAD differences

tib_diff <- as_tibble(mcols(gr_LAD_consensus)) %>%
  mutate(PT_diff_24h = PT_24h - PT_0h,
         CTCFEL_diff_6h = CTCFEL_6h - CTCFEL_0h,
         CTCFEL_diff_24h = CTCFEL_24h - CTCFEL_0h,
         RAD21_diff_6h = RAD21_6h - RAD21_0h,
         RAD21_diff_24h = RAD21_24h - RAD21_0h,
         WAPL_diff_6h = WAPL_6h - WAPL_0h,
         WAPL_diff_24h = WAPL_24h - WAPL_0h,
         CTCFWAPL_diff_6h = CTCFWAPL_6h - CTCFWAPL_0h,
         CTCFWAPL_diff_24h = CTCFWAPL_24h - CTCFWAPL_0h)


#######################################
### Plot the differences
clones <- c("PT", "CTCFEL", "RAD21", "WAPL", "CTCFWAPL")
timepoints <- c("0h", "6h", "24h")

tib_diff_toplot <- tib_diff %>%
  gather(key, value, contains("diff")) %>%
  mutate(clone = str_remove(key, "_.*"),
         timepoint = str_remove(key, ".*_")) %>%
  mutate(clone = factor(clone, levels = clones),
         timepoint = factor(timepoint, levels = timepoints))

sd_fun <- function(x){
  return(data.frame(y = 1.25, label = round(sd(x), 2)))
}

tib_diff_toplot %>%
  ggplot(aes(x = clone, y = value, col = clone)) +
  geom_quasirandom() +
  geom_boxplot(fill = NA, outlier.shape = NA, col = "black") +
  stat_summary(fun.data = sd_fun, geom = "text", col = "black") +
  geom_hline(yintercept = 0, col = "blue", linetype = "dashed") +
  facet_grid(. ~ timepoint) +
  xlab("") +
  ylab("LAD difference (z-score)") +
  scale_color_brewer(palette = "Dark2") +
  theme_bw() +
  theme(aspect.ratio = 1,
        axis.text.x = element_text(angle = 90, hjust = 1))

#######################################
### Repeat plot, but highlight "differential LADs"

diff_range <- quantile(tib_diff$PT_diff_24h, c(0.001, 0.999), na.rm = T)

tib_diff_toplot <- tib_diff_toplot %>%
  mutate(diff = value < diff_range[1] | value > diff_range[2],
         class = case_when(value < diff_range[1] ~ "down",
                           value > diff_range[2] ~ "up", 
                           T ~ "stable"))

# Same plot as before
tib_diff_toplot %>%
  ggplot(aes(x = clone, y = value, col = diff)) +
  geom_quasirandom() +
  geom_boxplot(fill = NA, outlier.shape = NA, col = "black") +
  stat_summary(fun.data = sd_fun, geom = "text", col = "black") +
  geom_hline(yintercept = 0, col = "blue", linetype = "dashed") +
  facet_grid(. ~ timepoint) +
  xlab("") +
  ylab("LAD difference (z-score)") +
  scale_color_manual(values = c("grey70", "red")) +
  theme_bw() +
  theme(aspect.ratio = 1,
        axis.text.x = element_text(angle = 90, hjust = 1))

# Amount of differential LADs
tib_diff_toplot %>%
  group_by(clone, timepoint) %>%
  dplyr::summarise(diff_n = sum(diff, na.rm = T),
                   diff_fraction = mean(diff, na.rm = T)) %>%
  ungroup() %>%
  add_row(clone = "PT", timepoint = "6h", diff_fraction = 0, diff_n = 0) %>%
  mutate(clone = factor(clone, levels = clones),
         timepoint = factor(timepoint, levels = timepoints)) %>%
  
  ggplot(aes(x = clone, y = diff_fraction, fill = timepoint)) + 
  geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = diff_n, y = diff_fraction + 0.005), 
            position = position_dodge(width = 0.9),
            angle = 90, hjust = 0) +
  xlab("") +
  ylab("Fraction LADs different") +
  scale_fill_grey() +
  theme_bw() +
  theme(aspect.ratio = 1,
        axis.text.x = element_text(angle = 90, hjust = 1))

The results above provide numbers how many LADs are different. Bas liked it. I can add this to the manuscript.

Next, I tried to compare differential LADs with the LAD features.

#######################################
### Compare differential groups with features

# Prepare tibble with features
tib_diff_features <- tib_diff_toplot %>%
  dplyr::select(-ends_with("h")) %>%
  dplyr::select(-ATAC, -H3K27ac, -H3K36me3, -H3K4me1, -H3K4me3) %>%
  gather(feature, feature_value, 
         -key, -clone, -value, -timepoint, -diff, -class) %>%
  mutate(feature = factor(feature,
                          levels = c("size", 
                                     "ctcf_density",
                                     "active_gene_density",
                                     "H3K27me3",
                                     "H3K9me2",
                                     "local_lad_density",
                                     "distance_to_centromere",
                                     "distance_to_telomere",
                                     "chrom_size"))) %>%
  filter(clone != "PT") %>%
  arrange(feature, clone, timepoint) %>%
  mutate(key = factor(key, levels = unique(key)))

# Plot as boxplot
tib_diff_features %>%
  ggplot(aes(x = timepoint, y = feature_value, fill = class, 
             col = class, group = interaction(timepoint, class))) +
  geom_boxplot(outlier.shape = NA, col = "black",
               position = position_dodge(width = 1)) +
  facet_grid(feature ~ clone, scales = "free_y") +
  scale_fill_manual(values = c("blue", "grey70", "red")) +
  theme_bw() +
  theme(aspect.ratio = 1,
        axis.text.x = element_text(angle = 90, hjust = 1))

#######################################
### Determine significant correlations

wilcox_test_against_stable <- function(x, class, effect = "down") {
  wilcox.test(x[class == "stable"], x[class == effect])$p.value
}

direction_against_stable <- function(x, class, effect = "down") {
  ifelse(mean(x[class == "stable"], na.rm = T) > 
           mean(x[class == effect], na.rm = T),
         "lower", "higher")
}

# Determine significance for up and down separately
tib_diff_sign <- tib_diff_features %>%
  group_by(key, clone, timepoint, feature) %>%
  dplyr::summarise(down = wilcox_test_against_stable(feature_value, class, "down"),
                   down_direction = direction_against_stable(feature_value, class, "down"),
                   up = wilcox_test_against_stable(feature_value, class, "up"),
                   up_direction = direction_against_stable(feature_value, class, "up")) %>%
  ungroup() %>%
  gather(direction, pvalue, down, up) %>%
  mutate(padj = p.adjust(pvalue),
         sign = padj < 0.05,
         feature = factor(feature, levels = rev(levels(feature))),
         key = str_remove(key, "_diff"),
         key = factor(key, levels = unique(key))) %>%
  # Determine -log10(padj), where the direction is up/down
  mutate(padj_log10 = -log10(padj),
         padj_log10 = case_when(direction == "down" & 
                                  down_direction == "lower" ~ -padj_log10,
                                direction == "up" & 
                                  up_direction == "lower" ~ -padj_log10,
                                T ~ padj_log10))

# Plot
tib_diff_sign %>%
  ggplot(aes(x = key, y = feature, fill = padj_log10, label = round(padj_log10, 0))) +
  geom_tile() +
  geom_text(data = tib_diff_sign %>% filter(sign == T)) +
  facet_grid(. ~ direction) +
  xlab("") + 
  ylab("") +
  scale_fill_gradient2(high = "red", mid = "white", low = "blue", midpoint = 0) +
  theme_bw() +
  theme(aspect.ratio = 9/8,
        axis.text.x = element_text(angle = 90, hjust = 1))

I don’t like this. As you can see in the LAD size figure, small LADs are simply more variable between conditions. I think that this is because small LADs have fewer bins and therefore more intrinsic variation in their scores. I want to include the “differential analysis”, but that use the correlation matrix afterwards. This prevents the wrong message, that small LADs are both increasing and decreasing.

Conclusion

LAD features can explain a portion of the data. This is nice to add to the manuscript. It doesn’t functionally explain the data, but that’s also not the goal.

SessionInfo

sessionInfo()
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] knitr_1.33           RColorBrewer_1.1-2   corrr_0.4.3         
##  [4] broom_0.7.9          GGally_2.1.2         ggbeeswarm_0.6.0    
##  [7] rtracklayer_1.50.0   GenomicRanges_1.42.0 GenomeInfoDb_1.26.7 
## [10] IRanges_2.24.1       S4Vectors_0.28.1     BiocGenerics_0.36.1 
## [13] forcats_0.5.1        stringr_1.4.0        dplyr_1.0.7         
## [16] purrr_0.3.4          readr_2.0.0          tidyr_1.1.3         
## [19] tibble_3.1.6         ggplot2_3.3.5        tidyverse_1.3.1     
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-152                bitops_1.0-7               
##  [3] matrixStats_0.60.0          fs_1.5.0                   
##  [5] bit64_4.0.5                 lubridate_1.7.10           
##  [7] httr_1.4.2                  tools_4.0.5                
##  [9] backports_1.2.1             bslib_0.2.5.1              
## [11] utf8_1.2.2                  R6_2.5.1                   
## [13] vipor_0.4.5                 mgcv_1.8-36                
## [15] DBI_1.1.1                   colorspace_2.0-2           
## [17] withr_2.4.2                 tidyselect_1.1.1           
## [19] bit_4.0.4                   compiler_4.0.5             
## [21] cli_3.1.0                   rvest_1.0.1                
## [23] Biobase_2.50.0              xml2_1.3.2                 
## [25] DelayedArray_0.16.3         labeling_0.4.2             
## [27] sass_0.4.0                  scales_1.1.1               
## [29] digest_0.6.28               Rsamtools_2.6.0            
## [31] rmarkdown_2.11              XVector_0.30.0             
## [33] pkgconfig_2.0.3             htmltools_0.5.1.1          
## [35] MatrixGenerics_1.2.1        highr_0.9                  
## [37] dbplyr_2.1.1                rlang_0.4.12               
## [39] readxl_1.3.1                rstudioapi_0.13            
## [41] farver_2.1.0                jquerylib_0.1.4            
## [43] generics_0.1.0              jsonlite_1.7.2             
## [45] vroom_1.5.3                 BiocParallel_1.24.1        
## [47] RCurl_1.98-1.3              magrittr_2.0.1             
## [49] GenomeInfoDbData_1.2.4      Matrix_1.3-4               
## [51] Rcpp_1.0.7                  munsell_0.5.0              
## [53] fansi_0.5.0                 lifecycle_1.0.1            
## [55] stringi_1.7.3               yaml_2.2.1                 
## [57] SummarizedExperiment_1.20.0 zlibbioc_1.36.0            
## [59] plyr_1.8.6                  grid_4.0.5                 
## [61] crayon_1.4.2                lattice_0.20-44            
## [63] splines_4.0.5               Biostrings_2.58.0          
## [65] haven_2.4.1                 hms_1.1.0                  
## [67] pillar_1.6.4                codetools_0.2-18           
## [69] reprex_2.0.0                XML_3.99-0.6               
## [71] glue_1.5.0                  evaluate_0.14              
## [73] modelr_0.1.8                vctrs_0.3.8                
## [75] tzdb_0.1.2                  cellranger_1.1.0           
## [77] gtable_0.3.0                reshape_0.8.8              
## [79] assertthat_0.2.1            xfun_0.24                  
## [81] GenomicAlignments_1.26.0    beeswarm_0.4.0             
## [83] ellipsis_0.3.2